import pandas as pd


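# Background: pandas GroupBy follows the split-apply-combine pattern.

# The "split" step is pandas' groupby; we write the apply function ourselves,
# and pandas combines the values it returns into the final result.
# GroupBy.apply(function)
# The first argument passed to `function` is a DataFrame (one group).
# `function` may return a DataFrame, a Series, or a single value; the result
# does not even have to be related to the input DataFrame.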


# Demo: normalizing each user's movie ratings
def apply_demo():
    """Load the MovieLens-1M ratings file and return it as a DataFrame.

    The file has no header row and uses ``::`` as the field separator, so the
    column names are supplied explicitly and the Python parsing engine is
    requested (the multi-character separator needs it).

    Returns:
        DataFrame with columns UserID, MovieID, Rating and Timestamp.
    """
    column_names = "UserID::MovieID::Rating::Timestamp".split("::")
    return pd.read_csv(
        "../datas/movielens-1m/ratings.dat",
        sep="::",
        engine="python",
        names=column_names,
    )


# Group by user ID, then min-max normalize one column within each group
def ratings_norm(df):
    """Return a copy of ``df`` with a min-max normalized ``Rating_norm`` column.

    Intended as the function handed to ``GroupBy.apply``: ``df`` is the
    sub-frame of a single group, and ``Rating`` is rescaled against that
    group's own minimum and maximum.

    Args:
        df: DataFrame containing a numeric ``Rating`` column.

    Returns:
        A new DataFrame equal to ``df`` plus a float ``Rating_norm`` column.
        When every rating in ``df`` is identical there is no range to scale
        by, so ``Rating_norm`` is 0.0 instead of the NaN that a division by
        zero would produce.
    """
    ratings = df['Rating']
    min_val = ratings.min()
    span = ratings.max() - min_val
    offset = ratings - min_val
    if span != 0:
        normalized = offset / span
    else:
        # All ratings are equal, so every offset is already zero; just make
        # the dtype match the float result of the normal path.
        normalized = offset.astype(float)
    # ``assign`` builds a new frame, leaving the group object that
    # GroupBy.apply passed in unmodified.
    return df.assign(Rating_norm=normalized)


# Example 2: how to take the top-N rows of each group? Group by month and pick the rows with the highest temperature in each month
def tianqi_Demo():
    """Print, for each month, the row with the largest ``bWendu`` value.

    Reads the Beijing 2018 weather CSV, converts the two temperature columns
    from strings with a "℃" suffix to integers, derives a ``month`` key from
    the first seven characters of ``ymd`` (presumably a YYYY-MM prefix;
    confirm against the data file), and prints the result of applying
    ``getWenduTopN`` with ``top=-1`` to each month's group.
    """
    filepath = "../datas/beijing_tianqi/beijing_tianqi_2018.csv"
    df = pd.read_csv(filepath)
    # Strip the "℃" suffix and convert to int32. Plain column assignment is
    # used instead of ``df.loc[:, col] = ...`` because, on pandas >= 2.0, the
    # latter writes into the existing object-dtype column in place and the
    # int32 conversion is silently lost.
    for column in ("bWendu", "yWendu"):
        df[column] = df[column].str.replace("℃", "", regex=False).astype('int32')
    # Add the grouping key.
    df['month'] = df['ymd'].str[:7]
    print(df.groupby("month").apply(getWenduTopN, top=-1))
def getWenduTopN(df,top):
    """Return the tail of ``df`` after sorting ascending by ``bWendu``.

    Args:
        df: DataFrame with at least the columns ``ymd`` and ``bWendu``.
        top: Start of the row slice taken from the sorted frame. A negative
            value ``-n`` therefore keeps the ``n`` rows with the largest
            ``bWendu`` (e.g. ``-1`` keeps only the maximum).

    Returns:
        DataFrame restricted to the ``ymd`` and ``bWendu`` columns, in
        ascending ``bWendu`` order.
    """
    wanted = df[["ymd", "bWendu"]]
    ordered = wanted.sort_values(by="bWendu")
    return ordered[top:]



if __name__ == "__main__":
    # Rating-normalization demo, currently disabled:
    # rating = apply_demo()
    # print(rating)
    # res = rating.groupby('UserID').apply(ratings_norm)
    # # For the user with UserID == 1 the lowest rating given is 3 (an optimistic rater), so that rating normalizes to 0.
    # print(res[res["UserID"]==1].head())
    # Run the per-month top-N temperature demo.
    tianqi_Demo()
